IMPORTS

Python modules used.

In [155]:
import pandas as pd
import plotly.graph_objects as go
import plotly.express as px
import plotly.offline as py
import datetime
import numpy as np
import math
import reverse_geocoder as rg
from scipy.stats import norm
from scipy.stats import expon
from scipy.stats import lognorm
from scipy.stats import weibull_min
from scipy.stats import weibull_max
from scipy.stats import alpha
from scipy.stats import beta
import scipy.stats as stats
%matplotlib inline
from scipy import stats
from fitter import Fitter
from IPython.display import Image
#plotly.offline.init_notebook_mode(connected=True)
py.init_notebook_mode(connected=False)
#ignore deprication warnings
import warnings
warnings.filterwarnings('ignore')
import operator
import collections
from scipy.optimize import minimize
In [156]:
def objfun(x,*args):
    #x = params
    params=x
    #*args = dist, data
    dist = args[0]
    data = args[1]
    weights = args[2]
    fit = [dist.cdf(j, *params) for j in sorted(weights)]
    err = checkfit(data,fit)
    return err
    
    
    
def checkfit(data,fit):
    err=0
    for i in range(len(data)):
        err+=(data[i]-fit[i])**2
    return err

IMPORT DATA

For data reports: https://www.nohrsc.noaa.gov/nsa/reports.html

For definition of terms https://www.nohrsc.noaa.gov/help/

In [157]:
#read in the Surface Water Equivalent Data
SWE = pd.read_csv('swe.csv') 

DATA CLEANING

In [158]:
# remove some unused columns
SWE = SWE.drop(["Unnamed: 0","Unnamed: 0.1", "Unnamed: 10", "Unnamed: 9","Zip_Code"], axis=1)

#remove the zeros
SWE = SWE[SWE.Amount>0]

# add columns for year,month,day
SWE['year'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).year
SWE['month'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).month
SWE['day'] = pd.DatetimeIndex(SWE['DateTime_Report(UTC)']).day

# Add a column that counts the number of entries from each station
SWE['StationCounts'] = SWE.groupby(['Station_Id'])['Station_Id'].transform('count')

# throw away the stations with less than 'mincount' data points?
mincount = 100
SWE = SWE[SWE['StationCounts']>mincount]

# Make a copy of the dataframe with only unique stations for plotting on a map.
SWE_Stations = SWE.drop_duplicates('Station_Id',keep='first')
In [159]:
#drop all stations outside of MA
#THIS TAKES A WHILE, ONLY USED FOR SHOWING THE MAP PLOT OF STATIONS

rows_to_delete=[]
for i, row in enumerate(SWE_Stations.itertuples(), 1):
    #print(i,row.Index)
    coords = (SWE_Stations.iloc[i-1]['Latitude'],SWE_Stations.iloc[i-1]['Longitude'])
    results = rg.search(coords)
    if results[0]['admin1'] != "Massachusetts":
        rows_to_delete.append(row.Index)
SWE_Stations = SWE_Stations.drop(rows_to_delete)
In [160]:
#plot the stations on a map
fig = px.scatter_mapbox(SWE_Stations, lon="Longitude", lat="Latitude", hover_name="Station_Id", hover_data=["Station_Id","StationCounts"],
                        zoom=4, height=300,color="StationCounts",color_continuous_scale=px.colors.sequential.Viridis)
fig.update_layout(mapbox_style="open-street-map")
fig.update_layout(margin={"r":0,"t":0,"l":0,"b":0})
print("Plot of sites in MA")
fig.show()
#plotly.io.write_image(fig,"fig1.png")
#Image('fig1.png')
Plot of sites in MA
In [161]:
#GET ANNUAL SWE MAXIMUMS (For each station)
#GET MONTHLY SWE MAXIMUMS (For each station)
#The entire row that governs is carried forward
    
MONTHLY_DATA=pd.DataFrame() 
ANNUAL_DATA=pd.DataFrame() 

#for all stations?
#for station in set(SWE_Stations['Station_Id']):

#select the station with the most data       
for station in ['GRFM3']:
    for yr in range(2006,2020):
        tmp1 = SWE[(SWE['year']==yr) & (SWE['Station_Id']==station)]
        try:
            ANNUAL_DATA = ANNUAL_DATA.append(tmp1.loc[tmp1['Amount'].idxmax()])
        except:
            pass
        
        for month in range(1,13):
            tmp2 = SWE[(SWE['year']==yr) & (SWE['month']==month) & (SWE['Station_Id']==station)]
            try:
                MONTHLY_DATA = MONTHLY_DATA.append(tmp2.loc[tmp2['Amount'].idxmax()])
            except:
                pass

            
#limit monthly data to full years
MONTHLY_DATA = MONTHLY_DATA[(MONTHLY_DATA['year']>2007)]            

#drop some columns no longer needed
ANNUAL_DATA = ANNUAL_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
MONTHLY_DATA = MONTHLY_DATA.drop(["StationCounts","Latitude", "Longitude"], axis=1)
In [162]:
#for the purposes of plotting put in zeros for months without data
for yr in range(2007,2020):
    for month in range(1,13):
        tmp = MONTHLY_DATA[(MONTHLY_DATA['year']==yr) & (MONTHLY_DATA['month']==month)]
        #print(tmp.shape[0])
        if tmp.shape[0]==0:
            #add a zero.
            #'2006-11-25 16'
            dfadd = pd.DataFrame([[yr,month,15,0,str(yr)+"-"+"%02i" %month+"-01 00"]], columns = ['year','month','day','Amount','DateTime_Report(UTC)'])
            MONTHLY_DATA=MONTHLY_DATA.append(dfadd)
MONTHLY_DATA = MONTHLY_DATA.sort_values(by=['DateTime_Report(UTC)'])
In [163]:
#Plot the data over time

fig = go.Figure()
fig.add_trace(go.Line(x=MONTHLY_DATA['DateTime_Report(UTC)'], y=MONTHLY_DATA['Amount'],
                    mode='lines',marker=dict(color='red',size=8),line_shape='hvh',
                    name='Monthly Maximum Data'))
fig.add_trace(go.Line(x=ANNUAL_DATA['DateTime_Report(UTC)'], y=ANNUAL_DATA['Amount'],
                    mode='lines',marker=dict(color='black',size=8),line_shape='hvh',
                    name='Annual Maximum Data'))

fig.update_layout(
    title="Annual & Monthly Maximum SWE Data",
    xaxis_title="Date",
    yaxis_title="SWE (inches of water)",
    template='plotly_white')
fig.show() 
#plotly.io.write_image(fig,"fig2.png")
#Image('fig2.png')
In [164]:
#remove the zeros
ANNUAL_DATA = ANNUAL_DATA[ANNUAL_DATA.Amount>0]
MONTHLY_DATA = MONTHLY_DATA[MONTHLY_DATA.Amount>0]

FIT A DISTRIBUTION (ANNUAL DATA)

Set up the fitting

In [165]:
swe_vals_annual = sorted(list(ANNUAL_DATA["Amount"]))
h20 = 5.202288 #psf per inch of depth
weight_vals_annual = [i*h20 for i in swe_vals_annual]
n=len(weight_vals_annual)

All Fits

In [180]:
#try the fitter module

data_array = np.asarray(weight_vals_annual)
f = Fitter(data_array,verbose=False)
f.fit()
f.summary()
Out[180]:
sumsquare_error
mielke 0.043758
f 0.043949
exponpow 0.044044
chi2 0.044219
betaprime 0.044277

Show the Fit

In [167]:
ErrorThreshold = 0.03

fig = go.Figure()
n = len(weight_vals_annual)
Pvals_annual = []
for i in range(len(sorted(weight_vals_annual))):
    Pvals_annual.append(i/(n+1))

DISTRIBUTIONS = [stats.alpha,stats.anglit,stats.arcsine,stats.argus,stats.beta,stats.betaprime,stats.bradford,stats.burr,stats.burr12,stats.cauchy,stats.chi,stats.chi2,stats.cosine,stats.crystalball,stats.dgamma,stats.dweibull,stats.erlang,stats.expon,stats.exponnorm,stats.exponpow,stats.exponweib,stats.f,stats.fatiguelife,stats.fisk,stats.foldcauchy,stats.foldnorm,stats.frechet_l,stats.frechet_r,stats.gamma,stats.gausshyper,stats.genexpon,stats.genextreme,stats.gengamma,stats.genhalflogistic,stats.genlogistic,stats.gennorm,stats.genpareto,stats.gilbrat,stats.gompertz,stats.gumbel_l,stats.gumbel_r,stats.halfcauchy,stats.halfgennorm,stats.halflogistic,stats.halfnorm,stats.hypsecant,stats.invgamma,stats.invgauss,stats.invweibull,stats.johnsonsb,stats.johnsonsu,stats.kappa3,stats.kappa4,stats.ksone,stats.kstwobign,stats.laplace,stats.levy,stats.levy_l,stats.levy_stable,stats.loggamma,stats.logistic,stats.loglaplace,stats.lognorm,stats.lomax,stats.maxwell,stats.mielke,stats.moyal,stats.nakagami,stats.ncf,stats.nct,stats.ncx2,stats.norm,stats.norminvgauss,stats.pareto,stats.pearson3,stats.powerlaw,stats.powerlognorm,stats.powernorm,stats.rayleigh,stats.rdist,stats.recipinvgauss,stats.reciprocal,stats.rice,stats.rv_continuous,stats.rv_histogram,stats.semicircular,stats.skewnorm,stats.t,stats.trapz,stats.triang,stats.truncexpon,stats.truncnorm,stats.tukeylambda,stats.uniform,stats.vonmises,stats.vonmises_line,stats.wald,stats.weibull_max,stats.weibull_min,stats.wrapcauchy]
DIST_NAMES = ['alpha','anglit','arcsine','argus','beta','betaprime','bradford','burr','burr12','cauchy','chi','chi2','cosine','crystalball','dgamma','dweibull','erlang','expon','exponnorm','exponpow','exponweib','f','fatiguelife','fisk','foldcauchy','foldnorm','frechet_l','frechet_r','gamma','gausshyper','genexpon','genextreme','gengamma','genhalflogistic','genlogistic','gennorm','genpareto','gilbrat','gompertz','gumbel_l','gumbel_r','halfcauchy','halfgennorm','halflogistic','halfnorm','hypsecant','invgamma','invgauss','invweibull','johnsonsb','johnsonsu','kappa3','kappa4','ksone','kstwobign','laplace','levy','levy_l','levy_stable','loggamma','logistic','loglaplace','lognorm','lomax','maxwell','mielke','moyal','nakagami','ncf','nct','ncx2','norm','norminvgauss','pareto','pearson3','powerlaw','powerlognorm','powernorm','rayleigh','rdist','recipinvgauss','reciprocal','rice','rv_continuous','rv_histogram','semicircular','skewnorm','t','trapz','triang','truncexpon','truncnorm','tukeylambda','uniform','vonmises','vonmises_line','wald','weibull_max','weibull_min','wrapcauchy']
skips = []
for i in range(len(DIST_NAMES)):
    #print(DIST_NAMES[i])
    if DIST_NAMES[i] not in skips:
        error = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
        if np.isnan(error) or error>ErrorThreshold:
            visible='legendonly'
        else:
            visible=True
        
        try:
            name = DIST_NAMES[i]
            dist = DISTRIBUTIONS[i]
            params = f.fitted_param[name]

            yy = x=np.linspace(0.001,0.999,98)
            xx = [dist.ppf(i, *params) for i in yy]
            fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                                mode='lines',hovertext="%.4f" % error,visible=visible,name=name))
        except KeyError:
            pass

        
fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
                    mode='markers',marker=dict(color='black',size=8),
                    name='data from station: GRFM3'))

fig.update_layout(
    title="Fitting Distributions to Annual Maximum Data",
    xaxis_title="Annual Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show() 
#plotly.io.write_image(fig,"fig3.png")
#Image('fig3.png')

These don't appear to fit very well, since the fit is based on the histogram (pdf) and there isn't enough data here to make a decent histogram. So solve for optimized parameters using my own objective function that compares CDF values.

Improve the Fit

In [168]:
#THIS TAKES A WHILE...

ERROR={}
NEW_PARAMS={}

for i in range(len(DIST_NAMES)):
    if DIST_NAMES[i] in f.fitted_param:
        name = DIST_NAMES[i]
        dist = DISTRIBUTIONS[i]
        params = f.fitted_param[name]
        sol = minimize(objfun, params, args=(dist,Pvals_annual,weight_vals_annual))
        error = sol.fun
        newparams = sol.x
        ERROR[name] = error
        NEW_PARAMS[name] = newparams
        
    #except:
    #    pass
ERROR = sorted(ERROR.items(), key=operator.itemgetter(1))
ERROR = collections.OrderedDict(ERROR)
In [169]:
fig = go.Figure()

ErrorThreshold=0.01

skips = []
for i in range(len(DIST_NAMES)):
    #try:
    if True:
        name = DIST_NAMES[i]
        if name not in skips and name in ERROR:
            error = ERROR[name]
            if np.isnan(error) or error>ErrorThreshold:
                visible='legendonly'
            else:
                visible=True
                
            dist = DISTRIBUTIONS[i]
            params = NEW_PARAMS[name]
            yy = x=np.linspace(0.001,0.999,98)
            xx = [dist.ppf(i, *params) for i in yy]
            fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                                mode='lines',hovertext="%.4f" % error,visible=visible,name=name))
fig.add_trace(go.Scatter(x=sorted(weight_vals_annual), y=Pvals_annual,
                    mode='markers',marker=dict(color='black',size=8),
                    name='data from station: GRFM3'))
fig.update_layout(
    title="Fitting Distributions to Annual Maximum Data",
    xaxis_title="Annual Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show() 

Estimate 50yr MRI

In [170]:
#use the best fit distributions to estimate the 50yr MRI event. 
ErrorThreshold = 0.009
best_fits=[]
for i in range(len(DIST_NAMES)):
    try:
        if DIST_NAMES[i] not in skips:
            #error = f.df_errors['sumsquare_error'][DIST_NAMES[i]]
            error = ERROR[DIST_NAMES[i]]
            #print()
            if not np.isnan(error) and error<ErrorThreshold:
                #print(DIST_NAMES[i],error)
                best_fits.append(DIST_NAMES[i])
    except KeyError:
        pass
#print(best_fits)
print("50yr MRI Snow Load from Annual Maximums")
for fit in best_fits:
    i = DIST_NAMES.index(fit)
    dist = DISTRIBUTIONS[i]
    params = NEW_PARAMS[fit]
    yy = 0.98
    xx = dist.ppf(yy, *params)
    print("%20s"%fit,"\t","%.2f" %xx,"psf")
50yr MRI Snow Load from Annual Maximums
                beta 	 134.65 psf
          gausshyper 	 137.97 psf
           johnsonsb 	 131.89 psf
              kappa4 	 133.04 psf

FIT A DISTRIBUTION (MONTHLY DATA)

In [171]:
#repeat with monthly data
swe_vals_monthly = sorted(list(MONTHLY_DATA["Amount"]))
h20 = 5.202288 #psf per inch of depth
weight_vals_monthly = [i*h20 for i in swe_vals_monthly]
n=len(weight_vals_monthly)   
In [172]:
Pvals_monthly = []
X = []
n = len(weight_vals_monthly)
for i in range(len(sorted(weight_vals_monthly))):
    Pvals_monthly.append(i/(n+1))
    X.append(weight_vals_monthly[i])

ERROR_monthly={}
NEW_PARAMS_monthly={}

for i in range(len(DIST_NAMES)):
    try:
        name = DIST_NAMES[i]
        dist = DISTRIBUTIONS[i]
        params = f.fitted_param[name]      

        sol = minimize(objfun, params, args=(dist,Pvals_monthly,weight_vals_monthly))
        error = sol.fun
        newparams = sol.x
        ERROR_monthly[name] = error
        NEW_PARAMS_monthly[name] = newparams
        
    except:
        pass
ERROR_monthly = sorted(ERROR_monthly.items(), key=operator.itemgetter(1))
ERROR_monthly = collections.OrderedDict(ERROR_monthly)
In [174]:
fig = go.Figure()

ErrorThreshold = 0.033

skips = []
for i in range(len(DIST_NAMES)):
    name = DIST_NAMES[i]
    if name not in skips and name in ERROR_monthly:
        error = ERROR_monthly[name]
        if np.isnan(error) or error>ErrorThreshold:
            visible='legendonly'
        else:
            visible=True
    
        dist = DISTRIBUTIONS[i]
        params = NEW_PARAMS_monthly[name]
        yy = np.linspace(0.001,0.999,98)
        xx = [dist.ppf(i, *params) for i in yy]
        fig.add_trace(go.Scatter(x=xx, y=yy, line=dict(width=1),hoverinfo='name+text',
                            mode='lines',hovertext="%.4f" % error,visible=visible,name=name))
        
fig.add_trace(go.Scatter(x=sorted(weight_vals_monthly), y=Pvals_monthly,
                    mode='markers',marker=dict(color='black',size=8),
                    name='data from station: GRFM3'))

fig.update_layout(
    title="Fitting Distributions to Monthly Maximum Data",
    xaxis_title="Monthly Maximum Measured Snow Weight (psf)",
    yaxis_title="Probability",
    template='plotly_white')
fig.update_xaxes(range=[0, 140])
fig.show() 
#plotly.io.write_image(fig,"fig4.png")
#Image('fig4.png')
In [176]:
#use these distributions to estimate the 50yr MRI event. 

ErrorThreshold = 0.03
best_fits=[]
for i in range(len(DIST_NAMES)):
    name = DIST_NAMES[i]
    if DIST_NAMES[i] not in skips and name in ERROR_monthly:
        error = ERROR_monthly[name]
        if not np.isnan(error) and error<ErrorThreshold:
            
            best_fits.append(DIST_NAMES[i])

#annual
#  0.98 = 1 - (1/50)

#monthly (considering 12 months per year)
#  0.9983 = 1 - (1/(12*50)

#monthly (considering 2.4 months per year)
#  0.9917 = 1 - (1/(2.4*50)

months_per_year = MONTHLY_DATA.shape[0]/(MONTHLY_DATA['year'].max()-MONTHLY_DATA['year'].min()+1)
yy = 1 - (1/(months_per_year*50))
#print(months_per_year,yy,"\n\n")
print("\t\t50yr MRI Snow Load from Monthly Maximums")
for fit in best_fits:
    i = DIST_NAMES.index(fit)
    dist = DISTRIBUTIONS[i]
    params = NEW_PARAMS_monthly[fit]
    xx = dist.ppf(yy, *params)
    print("%20s"%fit,"\t","%.2f" %xx,"psf")
		50yr MRI Snow Load from Monthly Maximums
                fisk 	 728.70 psf
          genextreme 	 796.59 psf
          halfcauchy 	 1155.12 psf
            invgamma 	 756.81 psf
          invweibull 	 796.80 psf
          loglaplace 	 2667.02 psf

Discussion

These values are so high, I may have something wrong in here.

ASCE 7 ground snow load is 50psf. I'm estimating 130 psf with annual data 700+ psf with monthly data.